R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(readr)
# Read the diabetes prediction CSV (relative path) into a tibble.
# No column types are given, so read_csv() guesses them and prints the
# column specification shown below.
diabetes_dataset <- read_csv("diabetes_prediction_dataset.csv")
## Rows: 100000 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): gender, smoking_history
## dbl (7): age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_l...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(diabetes_dataset)

Including Plots

You can also embed plots, for example:

diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# male dataset: every row whose gender is exactly "Male"
male_data <- diabetes_dataset %>% filter(gender == "Male")
# female dataset: every row whose gender is exactly "Female"
female_data <- diabetes_dataset %>% filter(gender == "Female")
female_data
## # A tibble: 58,552 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Female    36            0             0 current          23.4         5  
##  4 Female    20            0             0 never            27.3         6.6
##  5 Female    44            0             0 never            19.3         6.5
##  6 Female    79            0             0 No Info          23.9         5.7
##  7 Female    32            0             0 never            27.3         5  
##  8 Female    53            0             0 never            27.3         6.1
##  9 Female    54            0             0 former           54.7         6  
## 10 Female    78            0             0 former           36.0         5  
## # ℹ 58,542 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# males and females within original dataset that have a "normal" A1C

# "Normal" means HbA1c strictly below 5.7; a value of exactly 5.7 is already
# prediabetes under the HbA1c categories used later in this report.
# The count printed below was produced with the old `<= 5.7` cut-off and
# will change when the document is re-knit.
female_data %>% filter(HbA1c_level < 5.7) %>% tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 27397
# "Normal" means HbA1c strictly below 5.7; a value of exactly 5.7 is already
# prediabetes under the HbA1c categories used later in this report.
# The count printed below was produced with the old `<= 5.7` cut-off and
# will change when the document is re-knit.
male_data %>% filter(HbA1c_level < 5.7) %>% tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 18865
# count of people (male and female) with both heart disease and diabetes

# rows where the person has both diabetes and heart disease
filter(diabetes_dataset, diabetes == 1 & heart_disease == 1)
## # A tibble: 1,267 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      67            0             1 not current      27.3         6.5
##  2 Male      57            1             1 not current      27.8         6.6
##  3 Male      80            0             1 former           24.4         7.5
##  4 Male      75            0             1 not current      28.1         7.5
##  5 Male      69            0             1 former           24.1         6.8
##  6 Female    59            0             1 never            60.3         8.8
##  7 Male      80            0             1 former           33.0         6  
##  8 Female    62            1             1 former           44.2         8.2
##  9 Female    62            1             1 never            43.2         8.8
## 10 Female    76            0             1 former           25.7         9  
## # ℹ 1,257 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# number of rows that satisfy both conditions
diabetes_dataset %>%
  filter(diabetes == 1 & heart_disease == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1267
# count of overweight people based on bmi who have heart disease
# NOTE(review): group_by(bmi >= 30) only adds a logical `bmi >= 30` grouping
# column; it does not remove anyone. The result is therefore every row with
# heart_disease == 1 (bmi below 30 included), and no count is produced.
diabetes_dataset %>% group_by(bmi >= 30) %>% filter(heart_disease == 1)
## # A tibble: 3,942 × 10
## # Groups:   bmi >= 30 [2]
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Male      76            1             1 current          20.1         4.8
##  3 Female    72            0             1 former           27.9         6.5
##  4 Male      67            0             1 not current      27.3         6.5
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    59            0             1 ever             23.1         6.5
##  7 Male      68            1             1 current          27.3         5  
##  8 Male      59            0             1 ever             30.8         5  
##  9 Female    80            0             1 never            29.6         5.8
## 10 Male      57            1             1 not current      27.8         6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   `bmi >= 30` <lgl>
# Count of people with bmi >= 30 who also have heart disease.
# The previous version repeated the group_by()/filter() line above and never
# tallied; the table printed below comes from that version and will be
# replaced by a single count when the document is re-knit.
diabetes_dataset %>% filter(bmi >= 30, heart_disease == 1) %>% tally()
## # A tibble: 3,942 × 10
## # Groups:   bmi >= 30 [2]
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Male      76            1             1 current          20.1         4.8
##  3 Female    72            0             1 former           27.9         6.5
##  4 Male      67            0             1 not current      27.3         6.5
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    59            0             1 ever             23.1         6.5
##  7 Male      68            1             1 current          27.3         5  
##  8 Male      59            0             1 ever             30.8         5  
##  9 Female    80            0             1 never            29.6         5.8
## 10 Male      57            1             1 not current      27.8         6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   `bmi >= 30` <lgl>
# "obese men" with bmi higher than 30 and that have diabetes (tally on second line)
# threshold is inclusive: bmi of 30 or above, with diabetes
filter(male_data, bmi >= 30 & diabetes == 1)
## # A tibble: 1,903 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      50            0             0 former           37.2         9  
##  2 Male      53            0             0 current          30.8         6.6
##  3 Male      76            0             0 never            31.9         7.5
##  4 Male      63            1             0 ever             35.1         5.8
##  5 Male      48            1             0 current          36.1         6.8
##  6 Male      37            0             0 never            37.2         7  
##  7 Male      36            0             0 not current      46.1         6.2
##  8 Male      50            0             0 never            31.8         7.5
##  9 Male      43            0             0 never            69.4         7.5
## 10 Male      43            1             0 not current      40.9         6.6
## # ℹ 1,893 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many men meet both conditions
male_data %>%
  filter(bmi >= 30 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1903
# "obese women" with bmi higher than 30 and that have diabetes (tally on second line)
# threshold is inclusive: bmi of 30 or above, with diabetes
filter(female_data, bmi >= 30 & diabetes == 1)
## # A tibble: 2,330 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    67            0             0 never            63.5         8.8
##  2 Female    36            0             0 current          32.3         6.2
##  3 Female    77            0             0 never            31.7         6.5
##  4 Female    47            0             0 never            36.5         7.5
##  5 Female    61            0             0 not current      39.4         9  
##  6 Female    80            0             0 former           36.2         6.5
##  7 Female    52            1             0 never            50.3         6.6
##  8 Female    68            0             0 No Info          40.3         7.5
##  9 Female    70            0             0 not current      33.2         7.5
## 10 Female    67            0             0 former           32.3         7  
## # ℹ 2,320 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many women meet both conditions (female_data holds only the rows
# with gender == "Female"; no grouping is involved)
female_data %>%
  filter(bmi >= 30 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2330
# "underweight men"  with bmi lower than 19 and that have diabetes (tally on second line)
# threshold is inclusive: bmi of 19 or below, with diabetes
filter(male_data, bmi <= 19 & diabetes == 1)
## # A tibble: 21 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 current          11.9         6  
##  2 Male       6            0             0 never            15.7         6.1
##  3 Male      71            1             0 former           13.2         6.6
##  4 Male      14            0             0 never            19.0         6.6
##  5 Male      54            0             0 never            18.9         6  
##  6 Male      61            1             0 never            18.4         6.5
##  7 Male       4            0             0 never            18.7         6  
##  8 Male      51            0             0 current          17.8         6.2
##  9 Male      80            1             0 current          19.0         6.6
## 10 Male       6            0             0 No Info          15.6         9  
## # ℹ 11 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many men meet both conditions
male_data %>%
  filter(bmi <= 19 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    21
# "underweight women"  with bmi lower than 19 and that have diabetes (tally on second line)
# threshold is inclusive: bmi of 19 or below, with diabetes
filter(female_data, bmi <= 19 & diabetes == 1)
## # A tibble: 57 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    79            0             0 not current      18.1         7  
##  2 Female     4            0             0 No Info          15.0         6.5
##  3 Female    51            0             0 current          17.4         7  
##  4 Female     9            0             0 never            16           6.1
##  5 Female    60            0             0 No Info          17.9         8.2
##  6 Female    13            0             0 No Info          17.3         6.2
##  7 Female    80            0             0 never            17.4         6.5
##  8 Female     8            0             0 No Info          14.3         7.5
##  9 Female    80            0             0 never            17.8         6.2
## 10 Female    78            1             0 not current      17.7         8.8
## # ℹ 47 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many women meet both conditions
female_data %>%
  filter(bmi <= 19 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    57
# The assumption is that overweight people are more likely to have diabetes. Below are the rows and the tally of MEN who are overweight in terms of bmi (bmi of 30 or above) and DON'T have diabetes
# men with bmi of 30 or above and no diabetes
filter(male_data, bmi >= 30 & diabetes == 0)
## # A tibble: 7,445 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 never            33.6         4.8
##  2 Male      15            0             0 never            30.4         6.1
##  3 Male      40            0             0 current          36.4         6  
##  4 Male      30            0             0 never            33.8         6.1
##  5 Male      34            0             0 never            31.2         5.8
##  6 Male      54            0             0 never            31.9         6.6
##  7 Male      79            0             0 former           31.2         5.8
##  8 Male      54            0             0 former           32.8         5  
##  9 Male      38            0             0 never            55.6         6.5
## 10 Male      58            0             0 former           36.5         5.8
## # ℹ 7,435 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many men meet both conditions
male_data %>%
  filter(bmi >= 30 & diabetes == 0) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  7445
# The assumption is that overweight people are more likely to have diabetes. Below are the rows and the tally of WOMEN who are overweight in terms of bmi (bmi of 30 or above) and DON'T have diabetes
# women with bmi of 30 or above and no diabetes
filter(female_data, bmi >= 30 & diabetes == 0)
## # A tibble: 11,852 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    54            0             0 former           54.7         6  
##  2 Female    78            0             0 former           36.0         5  
##  3 Female    53            0             0 No Info          31.8         4  
##  4 Female    34            0             0 never            56.4         6.2
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    27            0             0 not current      30.2         5.7
##  7 Female    37            0             0 No Info          30.5         5.7
##  8 Female    56            0             0 never            31.0         6.5
##  9 Female    44            0             0 never            37.4         5.7
## 10 Female    30            0             0 No Info          50.1         6  
## # ℹ 11,842 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# how many women meet both conditions
female_data %>%
  filter(bmi >= 30 & diabetes == 0) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 11852

Diabetes Dataset

6 rows
gender age hypertension heart_disease smoking_history bmi HbA1c_level blood_glucose_level diabetes
Female 80 0 1 never 25.19 6.6 140 0
Female 54 0 0 No Info 27.32 6.6 80 0
Male 28 0 0 never 27.32 5.7 158 0
Female 36 0 0 current 23.45 5.0 155 0
Male 76 1 1 current 20.14 4.8 155 0
Female 20 0 0 never 27.32 6.6 85 0
diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>

Male vs. Female Blood Sugar Levels (HbA1c)

6 rows
gender age hypertension heart_disease smoking_history bmi HbA1c_level blood_glucose_level diabetes HbA1c_category
Female 80 0 1 never 25.19 6.6 140 0 Diabetes ≥ 6.5%
Female 54 0 0 No Info 27.32 6.6 80 0 Diabetes ≥ 6.5%
Male 28 0 0 never 27.32 5.7 158 0 Prediabetes 5.7% - 6.4%
Female 36 0 0 current 23.45 5.0 155 0 Normal < 5.7%
Male 76 1 1 current 20.14 4.8 155 0 Normal < 5.7%
Female 20 0 0 never 27.32 6.6 85 0 Diabetes ≥ 6.5%

Similar Prevalence of Prediabetes – The proportion of individuals categorized as having prediabetes (HbA1c 5.7% - 6.4%) is almost identical between males (41.3%) and females (41.4%). This suggests that prediabetes affects both genders at nearly the same rate.

  • Slightly Higher Diabetes Rates Among Males – More males (21.6%) fall into the diabetes (HbA1c ≥ 6.5%) category compared to females (20.2%). While the difference is small, it might indicate that men have a slightly higher risk of diabetes in this dataset.

Females Have a Slightly Higher Proportion of Normal Blood Sugar Levels – More females (38.4%) fall into the normal blood sugar category (<5.7%) compared to males (37.1%). This may indicate some slight protective factors or lifestyle differences in this group.

Since more males are in the diabetes category, there could be gender-related risk factors worth exploring—such as diet, activity levels, or genetic predisposition.

Overall, blood sugar regulation patterns appear fairly balanced between genders, but small differences suggest potential areas for further investigation.

Similar Prevalence of Prediabetes
The proportion of individuals classified as having prediabetes (HbA1c 5.7% - 6.4%) is nearly identical between males (41.3%) and females (41.4%). This suggests no significant disparity.

BMI Distribution by Hypertension Status Plot

Shows the distribution of BMI values based on hypertension status. A violin plot is great for visualizing the distribution and density of BMI across hypertension categories.

Shape and width: The width of each “violin” represents the density of BMI values at different levels. Wider sections mean more individuals have that BMI, while narrower sections indicate fewer people at those values.

Comparison of distributions: The blue violin represents people without hypertension (hypertension = 0), while the red violin represents those with hypertension (hypertension = 1). By comparing them, you can see how BMI differs between these groups.

The horizontal line around 25 BMI: This marks the median BMI for each group. Since both violins have a horizontal line in roughly the same position, it suggests that the median BMI is around 25 for both hypertensive and non-hypertensive individuals.

Density trends: If the violins have different thicknesses in certain BMI ranges, it tells you which BMI values are more or less common in each group. People with hypertension seem to have a higher BMI overall, but both groups share a similar median.

The distribution shape is different—for example, if one violin is wider at higher BMI values, it suggests that hypertension is more common among individuals with higher BMI.

Outliers or extreme values might appear as small bulges or extended tails at the ends of the violins, showing individuals with very high or low BMI.

Smokers go brrr

In the smoking data there are 6 unique values:

  1. Never: Has Never smoked
  2. Not current: Has smoked but is not currently smoking
  3. Former: Has quit smoking (abstained for longer than an unspecified period)
  4. Current: Is currently a smoker
  5. Ever: Has ever smoked regardless of current smoking status
  6. No Info: No smoking history information available

The total number of people who fall into each category is as follows:

  1. Never: 35095
  2. Not current: 6447
  3. Former: 9352
  4. Current: 9286
  5. Ever: 4004
  6. No Info: 35816

A sizable number of people fall into the No Info category.

The total number of people in the dataset is 100000. To help clean up the data, we can filter the ‘No Info’ people out. When we do that, 64184 people remain.

# List the distinct smoking-history labels, in order of first appearance
diabetes_dataset %>%
  distinct(smoking_history) %>%
  pull(smoking_history)
## [1] "never"       "No Info"     "current"     "former"      "ever"       
## [6] "not current"
# Number of people in each smoking-history category
diabetes_dataset %>%
  count(smoking_history, name = "total_people")
## # A tibble: 6 × 2
##   smoking_history total_people
##   <chr>                  <int>
## 1 No Info                35816
## 2 current                 9286
## 3 ever                    4004
## 4 former                  9352
## 5 never                  35095
## 6 not current             6447
# Tally people by smoking history and diabetes status, leaving out the rows
# with no smoking information.
# `.groups = "drop_last"` keeps the result grouped by smoking_history, which
# is what summarise() did implicitly before; stating it explicitly stops
# summarise() from printing its regrouping message.
smoking_diabetes_dataset <- diabetes_dataset %>%
  filter(smoking_history != "No Info") %>%
  group_by(smoking_history, diabetes) %>%
  summarise(total = n(), .groups = "drop_last")

Now we can graph the relationship between smoking history and diabetes.

library(dplyr)
library(ggplot2)

# Data prep: among people with heart disease, count how many fall in each
# BMI band, separately for each gender.
bmi_heart_gender <- diabetes_dataset %>%
  filter(heart_disease == 1) %>%
  mutate(
    bmi_category = case_when(
      bmi <= 19 ~ "Underweight",
      bmi >= 30 ~ "Overweight",
      TRUE ~ NA_character_
    )
  ) %>%
  # Rows between the two cut-offs were given NA above and are dropped here
  filter(!is.na(bmi_category)) %>%
  group_by(gender, bmi_category) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(gender) %>%
  # Share of each band within a gender, out of the rows kept above only
  mutate(percentage = round(100 * count / sum(count), 1)) %>%
  # Drop the grouping so later steps do not inherit it by accident
  ungroup() %>%
  # Fix the level order so "Underweight" is drawn to the left of "Overweight"
  mutate(
    bmi_category = factor(
      bmi_category,
      levels = c("Underweight", "Overweight")
    )
  )

# Dot + Line plot: one line per gender joining its two BMI bands
ggplot(
  bmi_heart_gender,
  aes(x = bmi_category, y = count, group = gender, color = gender)
) +
  # group = gender is inherited from the plot-level aes()
  geom_line(position = position_dodge(width = 0.5), linewidth = 1) +
  geom_point(size = 5, position = position_dodge(width = 0.5)) +
  geom_text(
    aes(label = paste0(count, " (", percentage, "%)")),
    vjust = -1,
    position = position_dodge(width = 0.5)
  ) +
  labs(
    title = "Heart Disease Cases by Gender and BMI Category (Dot-Line Plot)",
    x = "BMI Category",
    y = "Count of People with Heart Disease",
    color = "Gender"
  ) +
  theme_minimal()

library(dplyr)
library(ggplot2)

# Prepare the data: heart-disease cases per gender and BMI band
bmi_heart_gender <- diabetes_dataset %>%
  filter(heart_disease == 1) %>%
  mutate(
    bmi_category = case_when(
      bmi <= 19 ~ "Underweight",
      bmi >= 30 ~ "Overweight",
      TRUE ~ NA_character_
    )
  ) %>%
  # Rows between the two cut-offs were given NA above and are dropped here
  filter(!is.na(bmi_category)) %>%
  group_by(gender, bmi_category) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(gender) %>%
  # Share of each band within a gender, out of the rows kept above only
  mutate(percentage = round(100 * count / sum(count), 1)) %>%
  ungroup() %>%
  # Without explicit levels the bars are ordered alphabetically
  # ("Overweight" first), which disagrees with the dot-line plot of the
  # same data; fix the order to Underweight, then Overweight.
  mutate(
    bmi_category = factor(
      bmi_category,
      levels = c("Underweight", "Overweight")
    )
  )

# Plot: side-by-side bars per gender, labelled with count and percentage
ggplot(bmi_heart_gender, aes(x = bmi_category, y = count, fill = gender)) +
  # geom_col() draws bars whose height is the y value itself
  geom_col(position = position_dodge(width = 0.7)) +
  geom_text(
    aes(label = paste0(count, " (", percentage, "%)")),
    vjust = -0.5,
    position = position_dodge(width = 0.7)
  ) +
  labs(
    title = "Count and Percentage of Heart Disease Cases by Gender and BMI Category",
    x = "BMI Category",
    y = "Count of People with Heart Disease",
    fill = "Gender"
  ) +
  theme_minimal()

# load needed packages
library(dplyr)
library(ggplot2)

# Count the rows of `data` with bmi >= 30 and diabetes == 0, and express that
# count as a percentage of all rows in `data`.
#
# data:      data frame with `bmi` and `diabetes` columns
# sex_label: value stored in the `sex` column of the result
# Returns the summary with columns count, total, sex and percent.
summarise_obese_no_diabetes <- function(data, sex_label) {
  data %>%
    summarise(
      count = sum(bmi >= 30 & diabetes == 0),
      total = n()
    ) %>%
    mutate(
      sex = sex_label,
      percent = count / total * 100
    )
}

# 1. Summarise men
men_summary <- summarise_obese_no_diabetes(male_data, "Men")

# 2. Summarise women
women_summary <- summarise_obese_no_diabetes(female_data, "Women")

# 3. Combine into one data frame
summary_df <- bind_rows(men_summary, women_summary)

# 4. Plot: one bar per sex, labelled with the count and its share of that sex
ggplot(summary_df, aes(x = sex, y = count, fill = sex)) +
  geom_col(width = 0.6) +
  geom_text(
    aes(label = sprintf("%d (%.1f%%)", count, percent)),
    vjust = -0.5,
    size = 4
  ) +
  # Extra headroom at the top so the labels above the bars are not clipped
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Count of Overweight (BMI ≥ 30) without diabetes, by Sex",
    x     = NULL,
    y     = "Number of Individuals",
    fill  = NULL
  ) +
  theme_minimal(base_size = 14) +
  # The x axis already names each bar, so the fill legend is redundant
  theme(legend.position = "none")